{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设置driver"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#准备工作\n",
    "import pandas as pd\n",
    "import time\n",
    "from random import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-2-8ae4025e7ff4>:19: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # avoids the \"DevToolsActivePort file doesn't exist\" startup error\n",
    "opts.add_argument('--window-size=1920,3000')  # browser window size; Chrome expects WIDTH,HEIGHT\n",
    "opts.add_argument('--disable-gpu')  # Google's docs mention this flag as a workaround for a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "# Optional switches:\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false')  # do not load images, for speed\n",
    "#opts.add_argument('--headless')  # no visible browser window; needed on Linux hosts without a display\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 打开CNKI\n",
    "* 1.校园网，自动登录cnki.net\n",
    "* 2.校外网，需要登录fsso.cnki.net"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CNKI home page in the Selenium-controlled browser.\n",
    "driver.get('https://www.cnki.net/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 检查是否是中山大学南方学院登录（检查中山大学南方学院资源、保证后续可以下载）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'中山大学南...'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the displayed login name to confirm the institutional login.\n",
    "driver.find_element(By.ID, 'Ecp_loginShowName1').get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 打开高级检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the advanced-search page.\n",
    "element = driver.find_element(By.ID, 'highSearch')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 切换窗口"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 检查窗口\n",
    "眼见不一定为实"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-BA2766A93AA6B71E3BE76B0E3198EE77'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Handle (ID) of the window the driver is currently attached to\n",
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-BA2766A93AA6B71E3BE76B0E3198EE77',\n",
       " 'CDwindow-EBDC575D0939669B3A6B2B4A25122B12']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Handles (IDs) of all open windows\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-8-2c997ac77236>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Switch the driver to the second window (index 1 of window_handles).\n",
    "# `switch_to.window` replaces the deprecated `switch_to_window`.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 选择“学术期刊”"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"学术期刊\" (academic journals) entry.\n",
    "# NOTE(review): absolute XPath, so it depends on the current page layout.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/ul[1]/li[1]/a/span')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 选择专业检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch to professional search (element named \"majorSearch\").\n",
    "element = driver.find_element(By.NAME, 'majorSearch')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 选择期刊来源，分别勾选SCI、CSSCI、CSCD、EI "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SCI, per the original note.\n",
    "# NOTE(review): key \"CSI\" may actually be the CSSCI checkbox; confirm on the page.\n",
    "element = driver.find_element(By.XPATH, '//input[@key=\"CSI\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSSCI, per the original note.\n",
    "# NOTE(review): key \"SI\" may actually be the SCI checkbox; confirm on the page.\n",
    "element = driver.find_element(By.XPATH, '//input[@key=\"SI\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSCD\n",
    "element = driver.find_element(By.XPATH, '//input[@key=\"CSD\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# EI\n",
    "element = driver.find_element(By.XPATH, '//input[@key=\"EI\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设置搜索query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search expression that is typed into the search textarea in the next cell.\n",
    "query = 'SU = \"新媒体\" AND TI =\"网络\" '"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace whatever is in the search textarea with the query.\n",
    "element = driver.find_element(By.XPATH, '//textarea')\n",
    "element.clear()\n",
    "element.send_keys(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"检索\" (search) button to run the query.\n",
    "element = driver.find_element(By.XPATH, '//input[@value=\"检索\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 检查检索文章总数量的信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'共找到<em>903</em>条结果'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the total number of articles found by the search.\n",
    "element = driver.find_element(By.XPATH, '//span[@class=\"pagerTitleCell\"]')\n",
    "element.get_attribute(\"innerHTML\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设置页面显示文章数量并设置正序排序（每页50条结果）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the control used to change how many results are shown per page.\n",
    "element = driver.find_element(By.XPATH, '//i[@class=\"icon icon-sort\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show 50 results per page.\n",
    "element = driver.find_element(By.XPATH, '//div[@id=\"perPageDiv\"]//li[@data-val=\"50\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort by publication date, ascending (per the original note): click the\n",
    "# currently selected sort item, which is in descending state.\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"order-group\"]/ul/li[@class=\"descend cur\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1/19'"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pager indicator, shown as \"current page/total pages\".\n",
    "element = driver.find_element(By.XPATH, '//span[@class=\"countPageMark\"]')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 存入下载链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import requests\n",
    "import requests_html\n",
    "from requests_html import HTMLSession\n",
    "from lxml import etree\n",
    "from lxml.html import fromstring\n",
    "from urllib.parse import urljoin\n",
    "from random import random\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import time\n",
    "import base64\n",
    "import json\n",
    "from PIL import Image\n",
    "\n",
    "import selenium\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "HOST = \"https://kns.cnki.net/\"\n",
    "html = driver.page_source\n",
    "soup = etree.HTML(html)\n",
    "tr_list= soup.xpath('//div[@id=\"gridTable\"]/table/tbody/tr')\n",
    "# Build the Cookie header from the live Selenium session rather than pasting a\n",
    "# captured cookie string: a pasted value expires and exposes account details.\n",
    "headers = {\n",
    "    \"Cookie\": \"; \".join(f\"{c['name']}={c['value']}\" for c in driver.get_cookies()),\n",
    "    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'\n",
    "}\n",
    "def get_data(href):\n",
    "    \"\"\"Build the detail-page URL for one search-result link.\n",
    "\n",
    "    Args:\n",
    "        href: Link taken from a result row. It must carry ``DbCode``,\n",
    "            ``dbname`` and ``filename`` parameters (matched case-insensitively).\n",
    "\n",
    "    Returns:\n",
    "        The ``kcms/detail/detail.aspx`` URL built from those three values.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If any of the three parameters is missing from ``href``.\n",
    "    \"\"\"\n",
    "    values = {}\n",
    "    for key in (\"DbCode\", \"dbname\", \"filename\"):\n",
    "        # Stop at '&', '#' or end of string so a parameter that comes last in\n",
    "        # the link is still found (the old pattern required a trailing '&').\n",
    "        found = re.search(rf\"(?:^|[?&]){key}=([^&#]*)\", href, flags=re.IGNORECASE)\n",
    "        if found is None:\n",
    "            raise ValueError(f\"missing {key!r} parameter in link: {href!r}\")\n",
    "        values[key] = found.group(1)\n",
    "    return (\n",
    "        \"https://kns.cnki.net/kcms/detail/detail.aspx\"\n",
    "        f\"?dbcode={values['DbCode']}&dbname={values['dbname']}&filename={values['filename']}\"\n",
    "    )\n",
    "\n",
    "def download(href):\n",
    "    \"\"\"Fetch a detail page and return its absolute PDF download URL.\n",
    "\n",
    "    Args:\n",
    "        href: Detail-page URL, as produced by ``get_data``.\n",
    "\n",
    "    Returns:\n",
    "        The PDF link joined onto ``HOST``, or the string \"无下载链接\" when the\n",
    "        page has no ``btn-dlpdf`` link.\n",
    "\n",
    "    Raises:\n",
    "        requests.RequestException: On network failure or timeout.\n",
    "    \"\"\"\n",
    "    # Without a timeout a stalled connection would block the whole scrape.\n",
    "    response = requests.get(url=href, headers=headers, timeout=30)\n",
    "    detail_tree = etree.HTML(response.text)\n",
    "    if detail_tree is None:  # etree.HTML gives None for an empty body\n",
    "        return \"无下载链接\"\n",
    "    pdf_links = detail_tree.xpath('//li[@class=\"btn-dlpdf\"]/a/@href')\n",
    "    if pdf_links:\n",
    "        return urljoin(HOST, pdf_links[0])\n",
    "    return \"无下载链接\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_pages = 19  # total number of result pages to scrape\n",
    "\n",
    "title_list = []\n",
    "author_list = []\n",
    "href_list = []\n",
    "download_url_list = []\n",
    "\n",
    "\n",
    "def main():\n",
    "    \"\"\"Scrape every result page and return the rows as a DataFrame.\n",
    "\n",
    "    For each row of the result table on the page currently shown by\n",
    "    ``driver`` this collects the title, authors, detail-page URL and PDF\n",
    "    download URL, then moves to the next page until ``num_pages`` pages\n",
    "    have been read.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with columns 标题, 作者, 详细页链接, PDF下载链接.\n",
    "    \"\"\"\n",
    "    # Start from empty lists so re-running main() does not duplicate rows.\n",
    "    for collected in (title_list, author_list, href_list, download_url_list):\n",
    "        collected.clear()\n",
    "\n",
    "    # Inclusive range: the previous `while count < num_pages` loop skipped the last page.\n",
    "    for page_no in range(1, num_pages + 1):\n",
    "        soup = etree.HTML(driver.page_source)\n",
    "        for tr in soup.xpath('//div[@id=\"gridTable\"]/table/tbody/tr'):\n",
    "            links = tr.xpath('td[@class=\"name\"]/a/@href')\n",
    "            if not links:\n",
    "                continue  # row has no title link, so there is nothing to resolve\n",
    "            title = \"\".join(tr.xpath('td[@class=\"name\"]//text()')).strip().replace(\"\\n\",\"\").replace(\"                              网络首发\",\"\")\n",
    "            author = \",\".join(tr.xpath('td[@class=\"author\"]/a//text()')).strip().replace(\"\\n\",\"\")\n",
    "            href = get_data(links[0])\n",
    "            download_url = download(href)\n",
    "            title_list.append(title)\n",
    "            author_list.append(author)\n",
    "            href_list.append(href)\n",
    "            download_url_list.append(download_url)\n",
    "        if page_no < num_pages:\n",
    "            # Only move on while pages remain.\n",
    "            driver.find_element(By.XPATH, '//*[@id=\"PageNext\"]').click()\n",
    "            time.sleep(0.5)  # give the next page a moment to render\n",
    "    data = {\"标题\":title_list,\"作者\":author_list,\"详细页链接\":href_list,\"PDF下载链接\":download_url_list}\n",
    "    return pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>作者</th>\n",
       "      <th>详细页链接</th>\n",
       "      <th>PDF下载链接</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>寻求机遇  再造辉煌——网络传播时代中国电视业的生存及发展探析</td>\n",
       "      <td>程莉</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>网络时代的对话与交流——新媒体技术2000年报告会内容纪要</td>\n",
       "      <td></td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>发展新媒体的若干思考——由世界网络所想到的</td>\n",
       "      <td>林涛</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>论网络广播──网络广播现状和经营理念</td>\n",
       "      <td>杨叶青</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>新世纪的网络传播发展——“新世纪网络传播发展国际论坛”研讨会综述</td>\n",
       "      <td></td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>895</th>\n",
       "      <td>嵌入基层治理：县级融媒体中心与基层网络政务服务的融合发展</td>\n",
       "      <td>谢新洲,石林</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>896</th>\n",
       "      <td>长城新媒体集团融合创新春晚形态——“河北网络春节云联欢”云端放异彩</td>\n",
       "      <td>李建,田少华,李遥</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>897</th>\n",
       "      <td>新媒体时代图书网络营销矩阵建设实务研究</td>\n",
       "      <td>郑丽珠</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>898</th>\n",
       "      <td>嬗变、冲突与重构：新媒体视域下的网络舆论</td>\n",
       "      <td>陈晓伟,董烁</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>899</th>\n",
       "      <td>“转发”行为的扩散与新媒体赋权——基于微博自闭症议题的社会网络分析</td>\n",
       "      <td>黄月琴,黄宪成</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>900 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    标题         作者  \\\n",
       "0      寻求机遇  再造辉煌——网络传播时代中国电视业的生存及发展探析         程莉   \n",
       "1        网络时代的对话与交流——新媒体技术2000年报告会内容纪要              \n",
       "2                发展新媒体的若干思考——由世界网络所想到的         林涛   \n",
       "3                   论网络广播──网络广播现状和经营理念        杨叶青   \n",
       "4     新世纪的网络传播发展——“新世纪网络传播发展国际论坛”研讨会综述              \n",
       "..                                 ...        ...   \n",
       "895       嵌入基层治理：县级融媒体中心与基层网络政务服务的融合发展     谢新洲,石林   \n",
       "896  长城新媒体集团融合创新春晚形态——“河北网络春节云联欢”云端放异彩  李建,田少华,李遥   \n",
       "897                新媒体时代图书网络营销矩阵建设实务研究        郑丽珠   \n",
       "898               嬗变、冲突与重构：新媒体视域下的网络舆论     陈晓伟,董烁   \n",
       "899  “转发”行为的扩散与新媒体赋权——基于微博自闭症议题的社会网络分析    黄月琴,黄宪成   \n",
       "\n",
       "                                                 详细页链接  \\\n",
       "0    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "1    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "2    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "3    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "4    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "..                                                 ...   \n",
       "895  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "896  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "897  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "898  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "899  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "\n",
       "                                               PDF下载链接  \n",
       "0    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "1    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "2    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "3    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "4    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "..                                                 ...  \n",
       "895  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "896  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "897  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "898  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "899  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "\n",
       "[900 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Run the scrape and show the collected table.\n",
    "data = main()\n",
    "display(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the collected data to an Excel file for later use.\n",
    "data.to_excel('cnki下载文章.xlsx',sheet_name=\"pdf\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 抓取详细页面信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>寻求机遇 再造辉煌——网络传播时代中国电视业的生存及发展探析</td>\n",
       "      <td>程莉</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2000-07-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>52</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>网络时代的对话与交流——新媒体技术2000年报告会内容纪要</td>\n",
       "      <td>钟新</td>\n",
       "      <td>国际新闻界</td>\n",
       "      <td>2000-09-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>235</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>发展新媒体的若干思考——由世界网络所想到的</td>\n",
       "      <td>林涛</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2000-11-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>76</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>论网络广播──网络广播现状和经营理念</td>\n",
       "      <td>杨叶青</td>\n",
       "      <td>现代传播-北京广播学院学报</td>\n",
       "      <td>2000-12-15</td>\n",
       "      <td>13.0</td>\n",
       "      <td>344</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>新世纪的网络传播发展——“新世纪网络传播发展国际论坛”研讨会综述</td>\n",
       "      <td>王蕾</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2001-06-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>132</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>面对网络时代,广播如何应对?</td>\n",
       "      <td>罗湘萍</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2001-06-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>37</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>论高校网络传播的建设和管理</td>\n",
       "      <td>彭凤仪</td>\n",
       "      <td>浙江大学学报(人文社会科学版)</td>\n",
       "      <td>2001-12-25</td>\n",
       "      <td>15.0</td>\n",
       "      <td>166</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>新媒体 新挑战 新发展——首届海峡两岸网络与影视经营管理研讨会综述</td>\n",
       "      <td>许正林,范荣霞,张政方</td>\n",
       "      <td>上海大学学报(社会科学版)</td>\n",
       "      <td>2003-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>180</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>从网络新闻的运作看新旧媒体的互动</td>\n",
       "      <td>阎瑜</td>\n",
       "      <td>编辑之友</td>\n",
       "      <td>2005-07-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>194</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>“网络电视风暴”背后——兼论数字新媒体开发制度安排的重要性与紧迫性</td>\n",
       "      <td>高子华</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2005-07-25</td>\n",
       "      <td>7.0</td>\n",
       "      <td>229</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>新媒体视野中的网络文学</td>\n",
       "      <td>金振邦</td>\n",
       "      <td>东北师大学报</td>\n",
       "      <td>2005-09-20</td>\n",
       "      <td>25.0</td>\n",
       "      <td>1645</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>虚拟经济环境下网络媒体的盈利模式</td>\n",
       "      <td>方翔</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2006-04-05</td>\n",
       "      <td>6.0</td>\n",
       "      <td>482</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>传统图书与网络的新媒体出版趋势</td>\n",
       "      <td>曹欣渊</td>\n",
       "      <td>编辑学刊</td>\n",
       "      <td>2006-12-15</td>\n",
       "      <td>14.0</td>\n",
       "      <td>384</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>北京城市学院“网络传播与新媒体”专业</td>\n",
       "      <td>NaN</td>\n",
       "      <td>现代教育技术</td>\n",
       "      <td>2007-01-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>123</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>北京城市学院“网络传播与新媒体”专业2007年毕业44人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>现代教育技术</td>\n",
       "      <td>2007-02-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>中央电视台网络新媒体的探索与实践</td>\n",
       "      <td>汪文斌</td>\n",
       "      <td>电视研究</td>\n",
       "      <td>2007-03-05</td>\n",
       "      <td>4.0</td>\n",
       "      <td>127</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>2006“中国网络传播学年会”综述</td>\n",
       "      <td>杜骏飞; 文妤</td>\n",
       "      <td>现代传播(中国传媒大学学报)</td>\n",
       "      <td>2007-04-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>569</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>央视重大活动网络新媒体传播策略分析</td>\n",
       "      <td>王秀云</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2007-07-20</td>\n",
       "      <td>2.0</td>\n",
       "      <td>224</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>网络媒体在奥运传播中的合作模式分析</td>\n",
       "      <td>李芳; 邹英; 李明付</td>\n",
       "      <td>中国体育科技</td>\n",
       "      <td>2007-11-10</td>\n",
       "      <td>6.0</td>\n",
       "      <td>341</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>2007“中国网络传播年会”暨“新新论坛”综述</td>\n",
       "      <td>余秀才; 陈少华</td>\n",
       "      <td>现代传播(中国传媒大学学报)</td>\n",
       "      <td>2007-12-15</td>\n",
       "      <td>2.0</td>\n",
       "      <td>355</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>2007“中国网络传播年会”暨“新新论坛”综述</td>\n",
       "      <td>余秀才;陈少华</td>\n",
       "      <td>电影艺术</td>\n",
       "      <td>2008-01-05</td>\n",
       "      <td>1.0</td>\n",
       "      <td>140</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>网络内容整合:意义与原则</td>\n",
       "      <td>孙利军</td>\n",
       "      <td>中国编辑</td>\n",
       "      <td>2008-01-10</td>\n",
       "      <td>2.0</td>\n",
       "      <td>264</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>2007年我国网络传播研究综述</td>\n",
       "      <td>付玉辉</td>\n",
       "      <td>国际新闻界</td>\n",
       "      <td>2008-01-15</td>\n",
       "      <td>21.0</td>\n",
       "      <td>2341</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>网络媒体的蓝海战略——兼论新媒体的经营之道</td>\n",
       "      <td>赵志立</td>\n",
       "      <td>当代传播</td>\n",
       "      <td>2008-01-15</td>\n",
       "      <td>15.0</td>\n",
       "      <td>653</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>网络影响力:中国电视的新型评价体系</td>\n",
       "      <td>李岭涛; 黄宝书</td>\n",
       "      <td>现代传播(中国传媒大学学报)</td>\n",
       "      <td>2008-06-15</td>\n",
       "      <td>11.0</td>\n",
       "      <td>692</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>媒体生态：西方社会中的网络反文化</td>\n",
       "      <td>黄鸣奋</td>\n",
       "      <td>学术月刊</td>\n",
       "      <td>2008-07-20</td>\n",
       "      <td>2.0</td>\n",
       "      <td>526</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>网络人种的内心世界及其社会变革</td>\n",
       "      <td>马莉</td>\n",
       "      <td>中国青年研究</td>\n",
       "      <td>2008-08-05</td>\n",
       "      <td>1.0</td>\n",
       "      <td>77</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>论网络新媒体的说服模式——以“虎照事件”为例</td>\n",
       "      <td>张杰</td>\n",
       "      <td>兰州学刊</td>\n",
       "      <td>2008-08-15</td>\n",
       "      <td>2.0</td>\n",
       "      <td>536</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>网络恶搞的“第三人效果”研究——以1772位上海市民为例</td>\n",
       "      <td>禹卫华</td>\n",
       "      <td>新闻与传播研究</td>\n",
       "      <td>2008-08-15</td>\n",
       "      <td>10.0</td>\n",
       "      <td>2590</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>新媒体思考：我国网络传播的现状与趋势</td>\n",
       "      <td>熊澄宇</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2008-08-20</td>\n",
       "      <td>23.0</td>\n",
       "      <td>1581</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>网络版权营销:广电网站盈利思路探索——以四川广电神韵在线网站进行网络版权营销为例</td>\n",
       "      <td>李胜; 谢慧</td>\n",
       "      <td>新闻界</td>\n",
       "      <td>2008-10-15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>175</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>我国网络新媒体开发应用取得突破</td>\n",
       "      <td>NaN</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2008-11-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>119</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>从体育电视的网络传播实践看“台网互动”的基本模式</td>\n",
       "      <td>雷蔚真; 谢弛</td>\n",
       "      <td>国际新闻界</td>\n",
       "      <td>2008-11-23</td>\n",
       "      <td>13.0</td>\n",
       "      <td>734</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>联盟互动 借船造势 打造网络新媒体传播平台——央视网奥运报道之回顾与思考</td>\n",
       "      <td>汪文斌</td>\n",
       "      <td>电视研究</td>\n",
       "      <td>2008-12-05</td>\n",
       "      <td>9.0</td>\n",
       "      <td>225</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>期刊网络品牌的保护与推广</td>\n",
       "      <td>贾亚洲</td>\n",
       "      <td>科技与出版</td>\n",
       "      <td>2008-12-08</td>\n",
       "      <td>12.0</td>\n",
       "      <td>206</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>论网络媒体对当代大学生消费行为的影响</td>\n",
       "      <td>陈冲; 张丰</td>\n",
       "      <td>新闻界</td>\n",
       "      <td>2008-12-15</td>\n",
       "      <td>11.0</td>\n",
       "      <td>1462</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>再议网络出版的发展现状及其未来趋势</td>\n",
       "      <td>李江涛</td>\n",
       "      <td>中国出版</td>\n",
       "      <td>2008-12-15</td>\n",
       "      <td>9.0</td>\n",
       "      <td>354</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>北京奥运会网络视频传播状况的回顾与分析</td>\n",
       "      <td>王相飞</td>\n",
       "      <td>体育学刊</td>\n",
       "      <td>2009-02-28</td>\n",
       "      <td>25.0</td>\n",
       "      <td>513</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>新媒体技术的标准化战略:基于“行动者网络理论”</td>\n",
       "      <td>裘涵</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2009-03-20</td>\n",
       "      <td>4.0</td>\n",
       "      <td>572</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>网络媒体与传统媒体议程互动——以杨丽娟事件为例</td>\n",
       "      <td>陶贤都; 隋明晓</td>\n",
       "      <td>华中科技大学学报(社会科学版)</td>\n",
       "      <td>2009-03-25</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1753</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>全球化视野下新媒体与传统媒体融合问题研究——以电视媒体网络媒体为例</td>\n",
       "      <td>戴程</td>\n",
       "      <td>新闻界</td>\n",
       "      <td>2009-04-15</td>\n",
       "      <td>53.0</td>\n",
       "      <td>3233</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>网络媒体的内容把关和舆论引导</td>\n",
       "      <td>杨清波</td>\n",
       "      <td>当代传播</td>\n",
       "      <td>2009-05-15</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1086</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>传统主流媒体应在网络热点问题中彰显作为</td>\n",
       "      <td>王忠奇</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2009-06-20</td>\n",
       "      <td>4.0</td>\n",
       "      <td>128</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>网络视频发展的政策选择</td>\n",
       "      <td>杨明品; 贺筱玲</td>\n",
       "      <td>电视研究</td>\n",
       "      <td>2009-07-05</td>\n",
       "      <td>2.0</td>\n",
       "      <td>269</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>探索广播报道新思路——以《CRI观察》网络视频直播节目为例</td>\n",
       "      <td>邓黎</td>\n",
       "      <td>中国编辑</td>\n",
       "      <td>2009-07-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>187</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>新媒体时代网络音乐文化传播特征解析</td>\n",
       "      <td>陈辉</td>\n",
       "      <td>中国音乐学</td>\n",
       "      <td>2009-07-15</td>\n",
       "      <td>43.0</td>\n",
       "      <td>1537</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>政治、社会与新型网络应用——2008年中国网络传播研究的关键主题</td>\n",
       "      <td>杜骏飞</td>\n",
       "      <td>中国地质大学学报(社会科学版)</td>\n",
       "      <td>2009-07-15</td>\n",
       "      <td>15.0</td>\n",
       "      <td>1235</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>1994年以来中国网络新闻传播理论研究进展分析</td>\n",
       "      <td>杜骏飞</td>\n",
       "      <td>上海师范大学学报(哲学社会科学版)</td>\n",
       "      <td>2009-07-25</td>\n",
       "      <td>21.0</td>\n",
       "      <td>2647</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>成长优势与行业困局:网络电视实证调查研究</td>\n",
       "      <td>蒋宁平</td>\n",
       "      <td>中国电视</td>\n",
       "      <td>2009-08-15</td>\n",
       "      <td>10.0</td>\n",
       "      <td>411</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>网络文学:盛宴背后的审美伦理问题</td>\n",
       "      <td>欧阳友权</td>\n",
       "      <td>探索与争鸣</td>\n",
       "      <td>2009-08-15</td>\n",
       "      <td>20.0</td>\n",
       "      <td>1321</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                        篇名           作者  \\\n",
       "0            1            寻求机遇 再造辉煌——网络传播时代中国电视业的生存及发展探析           程莉   \n",
       "1            2             网络时代的对话与交流——新媒体技术2000年报告会内容纪要           钟新   \n",
       "2            3                     发展新媒体的若干思考——由世界网络所想到的           林涛   \n",
       "3            4                        论网络广播──网络广播现状和经营理念          杨叶青   \n",
       "4            5          新世纪的网络传播发展——“新世纪网络传播发展国际论坛”研讨会综述           王蕾   \n",
       "5            6                            面对网络时代,广播如何应对?          罗湘萍   \n",
       "6            7                             论高校网络传播的建设和管理          彭凤仪   \n",
       "7            8         新媒体 新挑战 新发展——首届海峡两岸网络与影视经营管理研讨会综述  许正林,范荣霞,张政方   \n",
       "8            9                          从网络新闻的运作看新旧媒体的互动           阎瑜   \n",
       "9           10         “网络电视风暴”背后——兼论数字新媒体开发制度安排的重要性与紧迫性          高子华   \n",
       "10          11                               新媒体视野中的网络文学          金振邦   \n",
       "11          12                          虚拟经济环境下网络媒体的盈利模式           方翔   \n",
       "12          13                           传统图书与网络的新媒体出版趋势          曹欣渊   \n",
       "13          14                        北京城市学院“网络传播与新媒体”专业          NaN   \n",
       "14          15              北京城市学院“网络传播与新媒体”专业2007年毕业44人          NaN   \n",
       "15          16                          中央电视台网络新媒体的探索与实践          汪文斌   \n",
       "16          17                         2006“中国网络传播学年会”综述      杜骏飞; 文妤   \n",
       "17          18                         央视重大活动网络新媒体传播策略分析          王秀云   \n",
       "18          19                         网络媒体在奥运传播中的合作模式分析  李芳; 邹英; 李明付   \n",
       "19          20                   2007“中国网络传播年会”暨“新新论坛”综述     余秀才; 陈少华   \n",
       "20          21                   2007“中国网络传播年会”暨“新新论坛”综述      余秀才;陈少华   \n",
       "21          22                              网络内容整合:意义与原则          孙利军   \n",
       "22          23                           2007年我国网络传播研究综述          付玉辉   \n",
       "23          24                     网络媒体的蓝海战略——兼论新媒体的经营之道          赵志立   \n",
       "24          25                         网络影响力:中国电视的新型评价体系     李岭涛; 黄宝书   \n",
       "25          26                          媒体生态：西方社会中的网络反文化          黄鸣奋   \n",
       "26          27                           网络人种的内心世界及其社会变革           马莉   \n",
       "27          28                    论网络新媒体的说服模式——以“虎照事件”为例           张杰   \n",
       "28          29              网络恶搞的“第三人效果”研究——以1772位上海市民为例          禹卫华   \n",
       "29          30                        新媒体思考：我国网络传播的现状与趋势          熊澄宇   \n",
       "30          31  网络版权营销:广电网站盈利思路探索——以四川广电神韵在线网站进行网络版权营销为例       李胜; 谢慧   \n",
       "31          32                           我国网络新媒体开发应用取得突破          NaN   \n",
       "32          33                  从体育电视的网络传播实践看“台网互动”的基本模式      雷蔚真; 谢弛   \n",
       "33          34      联盟互动 借船造势 打造网络新媒体传播平台——央视网奥运报道之回顾与思考          汪文斌   \n",
       "34          35                              期刊网络品牌的保护与推广          贾亚洲   \n",
       "35          36                        论网络媒体对当代大学生消费行为的影响       陈冲; 张丰   \n",
       "36          37                         再议网络出版的发展现状及其未来趋势          李江涛   \n",
       "37          38                       北京奥运会网络视频传播状况的回顾与分析          王相飞   \n",
       "38          39                   新媒体技术的标准化战略:基于“行动者网络理论”           裘涵   \n",
       "39          40                   网络媒体与传统媒体议程互动——以杨丽娟事件为例     陶贤都; 隋明晓   \n",
       "40          41         全球化视野下新媒体与传统媒体融合问题研究——以电视媒体网络媒体为例           戴程   \n",
       "41          42                            网络媒体的内容把关和舆论引导          杨清波   \n",
       "42          43                       传统主流媒体应在网络热点问题中彰显作为          王忠奇   \n",
       "43          44                               网络视频发展的政策选择     杨明品; 贺筱玲   \n",
       "44          45             探索广播报道新思路——以《CRI观察》网络视频直播节目为例           邓黎   \n",
       "45          46                         新媒体时代网络音乐文化传播特征解析           陈辉   \n",
       "46          47          政治、社会与新型网络应用——2008年中国网络传播研究的关键主题          杜骏飞   \n",
       "47          48                   1994年以来中国网络新闻传播理论研究进展分析          杜骏飞   \n",
       "48          49                      成长优势与行业困局:网络电视实证调查研究          蒋宁平   \n",
       "49          50                          网络文学:盛宴背后的审美伦理问题         欧阳友权   \n",
       "\n",
       "                   刊名        发表时间    被引    下载  操作  \n",
       "0            中国广播电视学刊  2000-07-25   1.0    52  下载  \n",
       "1               国际新闻界  2000-09-25   NaN   235  下载  \n",
       "2            中国广播电视学刊  2000-11-25   1.0    76  下载  \n",
       "3       现代传播-北京广播学院学报  2000-12-15  13.0   344  下载  \n",
       "4                新闻记者  2001-06-05   NaN   132  下载  \n",
       "5            中国广播电视学刊  2001-06-25   NaN    37  下载  \n",
       "6     浙江大学学报(人文社会科学版)  2001-12-25  15.0   166  下载  \n",
       "7       上海大学学报(社会科学版)  2003-01-15   NaN   180  下载  \n",
       "8                编辑之友  2005-07-25   NaN   194  下载  \n",
       "9            中国广播电视学刊  2005-07-25   7.0   229  下载  \n",
       "10             东北师大学报  2005-09-20  25.0  1645  下载  \n",
       "11               新闻记者  2006-04-05   6.0   482  下载  \n",
       "12               编辑学刊  2006-12-15  14.0   384  下载  \n",
       "13             现代教育技术  2007-01-01   NaN   123  下载  \n",
       "14             现代教育技术  2007-02-01   NaN    54  下载  \n",
       "15               电视研究  2007-03-05   4.0   127  下载  \n",
       "16     现代传播(中国传媒大学学报)  2007-04-01   NaN   569  下载  \n",
       "17           中国广播电视学刊  2007-07-20   2.0   224  下载  \n",
       "18             中国体育科技  2007-11-10   6.0   341  下载  \n",
       "19     现代传播(中国传媒大学学报)  2007-12-15   2.0   355  下载  \n",
       "20               电影艺术  2008-01-05   1.0   140  下载  \n",
       "21               中国编辑  2008-01-10   2.0   264  下载  \n",
       "22              国际新闻界  2008-01-15  21.0  2341  下载  \n",
       "23               当代传播  2008-01-15  15.0   653  下载  \n",
       "24     现代传播(中国传媒大学学报)  2008-06-15  11.0   692  下载  \n",
       "25               学术月刊  2008-07-20   2.0   526  下载  \n",
       "26             中国青年研究  2008-08-05   1.0    77  下载  \n",
       "27               兰州学刊  2008-08-15   2.0   536  下载  \n",
       "28            新闻与传播研究  2008-08-15  10.0  2590  下载  \n",
       "29           中国广播电视学刊  2008-08-20  23.0  1581  下载  \n",
       "30                新闻界  2008-10-15   1.0   175  下载  \n",
       "31               新闻记者  2008-11-05   NaN   119  下载  \n",
       "32              国际新闻界  2008-11-23  13.0   734  下载  \n",
       "33               电视研究  2008-12-05   9.0   225  下载  \n",
       "34              科技与出版  2008-12-08  12.0   206  下载  \n",
       "35                新闻界  2008-12-15  11.0  1462  下载  \n",
       "36               中国出版  2008-12-15   9.0   354  下载  \n",
       "37               体育学刊  2009-02-28  25.0   513  下载  \n",
       "38           中国广播电视学刊  2009-03-20   4.0   572  下载  \n",
       "39    华中科技大学学报(社会科学版)  2009-03-25  33.0  1753  下载  \n",
       "40                新闻界  2009-04-15  53.0  3233  下载  \n",
       "41               当代传播  2009-05-15  30.0  1086  下载  \n",
       "42           中国广播电视学刊  2009-06-20   4.0   128  下载  \n",
       "43               电视研究  2009-07-05   2.0   269  下载  \n",
       "44               中国编辑  2009-07-10   NaN   187  下载  \n",
       "45              中国音乐学  2009-07-15  43.0  1537  下载  \n",
       "46    中国地质大学学报(社会科学版)  2009-07-15  15.0  1235  下载  \n",
       "47  上海师范大学学报(哲学社会科学版)  2009-07-25  21.0  2647  下载  \n",
       "48               中国电视  2009-08-15  10.0   411  下载  \n",
       "49              探索与争鸣  2009-08-15  20.0  1321  下载  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#抓取第一页页面信息\n",
    "element=driver.find_element_by_id('gridTable')\n",
    "含有页面主要数据的表格_HTML=element.get_attribute(\"innerHTML\")\n",
    "数据 = pd.read_html(含有页面主要数据的表格_HTML)[0]\n",
    "数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 翻页"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'下一页'"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_id('PageNext')\n",
    "element.get_attribute(\"innerHTML\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "表格_html = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]\n"
     ]
    }
   ],
   "source": [
    "#查看所有页数\n",
    "pages = list(range(1,19))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        下一页 = driver.find_element_by_id('PageNext')\n",
    "        下一页.click()\n",
    "        time.sleep(20+10*random())\n",
    "        \n",
    "        #获取含有页面主要数据的表格\n",
    "        element = driver.find_element_by_id('gridTable')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        表格_html[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "2   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "3   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "4   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "5   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "6   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "7   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "8   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "9   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "10  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "11  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "12  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "13  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "14  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "15  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "16  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "17  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "18  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df = pd.DataFrame([表格_html]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "网站 = \"中国知网\"\n",
    "fn = { \"output\" : { \"htm_snippets\": \"data/htm_snippets_{网站}.tsv\"}\n",
    "     }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = fn [\"output\"] [\"htm_snippets\"] \n",
    "df.to_csv(filename.format(网站=网站), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "l_df = []\n",
    "for p in pages:\n",
    "    表格 = pd.read_html(表格_html[p])[0]\n",
    "    l_df.append(表格)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>寻求机遇 再造辉煌——网络传播时代中国电视业的生存及发展探析</td>\n",
       "      <td>程莉</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2000-07-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>网络时代的对话与交流——新媒体技术2000年报告会内容纪要</td>\n",
       "      <td>钟新</td>\n",
       "      <td>国际新闻界</td>\n",
       "      <td>2000-09-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>235.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>发展新媒体的若干思考——由世界网络所想到的</td>\n",
       "      <td>林涛</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2000-11-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>论网络广播──网络广播现状和经营理念</td>\n",
       "      <td>杨叶青</td>\n",
       "      <td>现代传播-北京广播学院学报</td>\n",
       "      <td>2000-12-15</td>\n",
       "      <td>13.0</td>\n",
       "      <td>344.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>新世纪的网络传播发展——“新世纪网络传播发展国际论坛”研讨会综述</td>\n",
       "      <td>王蕾</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2001-06-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>132.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>848</th>\n",
       "      <td>899</td>\n",
       "      <td>嬗变、冲突与重构：新媒体视域下的网络舆论</td>\n",
       "      <td>陈晓伟; 董烁</td>\n",
       "      <td>中国编辑</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>349.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>849</th>\n",
       "      <td>900</td>\n",
       "      <td>“转发”行为的扩散与新媒体赋权——基于微博自闭症议题的社会网络分析</td>\n",
       "      <td>黄月琴; 黄宪成</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>632.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>850</th>\n",
       "      <td>901</td>\n",
       "      <td>新媒体时代高校网络舆情引导机制探析</td>\n",
       "      <td>孙璐</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-06-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>851</th>\n",
       "      <td>902</td>\n",
       "      <td>新媒体环境下网络广告创意设计研究——评《新媒体时代下的网络广告设计应用》</td>\n",
       "      <td>赵静静</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-06-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>852</th>\n",
       "      <td>903</td>\n",
       "      <td>加强新时代高校网络思想政治教育探究</td>\n",
       "      <td>高歌</td>\n",
       "      <td>学校党建与思想教育</td>\n",
       "      <td>2021-06-23</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>903 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                    篇名        作者  \\\n",
       "0             1        寻求机遇 再造辉煌——网络传播时代中国电视业的生存及发展探析        程莉   \n",
       "1             2         网络时代的对话与交流——新媒体技术2000年报告会内容纪要        钟新   \n",
       "2             3                 发展新媒体的若干思考——由世界网络所想到的        林涛   \n",
       "3             4                    论网络广播──网络广播现状和经营理念       杨叶青   \n",
       "4             5      新世纪的网络传播发展——“新世纪网络传播发展国际论坛”研讨会综述        王蕾   \n",
       "..          ...                                   ...       ...   \n",
       "848         899                  嬗变、冲突与重构：新媒体视域下的网络舆论   陈晓伟; 董烁   \n",
       "849         900     “转发”行为的扩散与新媒体赋权——基于微博自闭症议题的社会网络分析  黄月琴; 黄宪成   \n",
       "850         901                     新媒体时代高校网络舆情引导机制探析        孙璐   \n",
       "851         902  新媒体环境下网络广告创意设计研究——评《新媒体时代下的网络广告设计应用》       赵静静   \n",
       "852         903                     加强新时代高校网络思想政治教育探究        高歌   \n",
       "\n",
       "                刊名        发表时间    被引     下载  操作  \n",
       "0         中国广播电视学刊  2000-07-25   1.0   52.0  下载  \n",
       "1            国际新闻界  2000-09-25   NaN  235.0  下载  \n",
       "2         中国广播电视学刊  2000-11-25   1.0   76.0  下载  \n",
       "3    现代传播-北京广播学院学报  2000-12-15  13.0  344.0  下载  \n",
       "4             新闻记者  2001-06-05   NaN  132.0  下载  \n",
       "..             ...         ...   ...    ...  ..  \n",
       "848           中国编辑  2021-05-10   NaN  349.0  下载  \n",
       "849           新闻记者  2021-05-20   NaN  632.0  下载  \n",
       "850          新闻爱好者  2021-06-20   NaN    NaN  下载  \n",
       "851          新闻爱好者  2021-06-20   NaN    NaN  下载  \n",
       "852      学校党建与思想教育  2021-06-23   NaN    5.0  下载  \n",
       "\n",
       "[903 rows x 8 columns]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_表格 = 数据.append(df_url_out)\n",
    "df_表格"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>寻求机遇 再造辉煌——网络传播时代中国电视业的生存及发展探析</td>\n",
       "      <td>程莉</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2000-07-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>网络时代的对话与交流——新媒体技术2000年报告会内容纪要</td>\n",
       "      <td>钟新</td>\n",
       "      <td>国际新闻界</td>\n",
       "      <td>2000-09-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>235.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>发展新媒体的若干思考——由世界网络所想到的</td>\n",
       "      <td>林涛</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2000-11-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>论网络广播──网络广播现状和经营理念</td>\n",
       "      <td>杨叶青</td>\n",
       "      <td>现代传播-北京广播学院学报</td>\n",
       "      <td>2000-12-15</td>\n",
       "      <td>13.0</td>\n",
       "      <td>344.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>新世纪的网络传播发展——“新世纪网络传播发展国际论坛”研讨会综述</td>\n",
       "      <td>王蕾</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2001-06-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>132.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>848</th>\n",
       "      <td>899</td>\n",
       "      <td>嬗变、冲突与重构：新媒体视域下的网络舆论</td>\n",
       "      <td>陈晓伟; 董烁</td>\n",
       "      <td>中国编辑</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>349.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>849</th>\n",
       "      <td>900</td>\n",
       "      <td>“转发”行为的扩散与新媒体赋权——基于微博自闭症议题的社会网络分析</td>\n",
       "      <td>黄月琴; 黄宪成</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>632.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>850</th>\n",
       "      <td>901</td>\n",
       "      <td>新媒体时代高校网络舆情引导机制探析</td>\n",
       "      <td>孙璐</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-06-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>851</th>\n",
       "      <td>902</td>\n",
       "      <td>新媒体环境下网络广告创意设计研究——评《新媒体时代下的网络广告设计应用》</td>\n",
       "      <td>赵静静</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-06-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>852</th>\n",
       "      <td>903</td>\n",
       "      <td>加强新时代高校网络思想政治教育探究</td>\n",
       "      <td>高歌</td>\n",
       "      <td>学校党建与思想教育</td>\n",
       "      <td>2021-06-23</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>903 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                    篇名        作者  \\\n",
       "0             1        寻求机遇 再造辉煌——网络传播时代中国电视业的生存及发展探析        程莉   \n",
       "1             2         网络时代的对话与交流——新媒体技术2000年报告会内容纪要        钟新   \n",
       "2             3                 发展新媒体的若干思考——由世界网络所想到的        林涛   \n",
       "3             4                    论网络广播──网络广播现状和经营理念       杨叶青   \n",
       "4             5      新世纪的网络传播发展——“新世纪网络传播发展国际论坛”研讨会综述        王蕾   \n",
       "..          ...                                   ...       ...   \n",
       "848         899                  嬗变、冲突与重构：新媒体视域下的网络舆论   陈晓伟; 董烁   \n",
       "849         900     “转发”行为的扩散与新媒体赋权——基于微博自闭症议题的社会网络分析  黄月琴; 黄宪成   \n",
       "850         901                     新媒体时代高校网络舆情引导机制探析        孙璐   \n",
       "851         902  新媒体环境下网络广告创意设计研究——评《新媒体时代下的网络广告设计应用》       赵静静   \n",
       "852         903                     加强新时代高校网络思想政治教育探究        高歌   \n",
       "\n",
       "                刊名        发表时间    被引     下载  操作  \n",
       "0         中国广播电视学刊  2000-07-25   1.0   52.0  下载  \n",
       "1            国际新闻界  2000-09-25   NaN  235.0  下载  \n",
       "2         中国广播电视学刊  2000-11-25   1.0   76.0  下载  \n",
       "3    现代传播-北京广播学院学报  2000-12-15  13.0  344.0  下载  \n",
       "4             新闻记者  2001-06-05   NaN  132.0  下载  \n",
       "..             ...         ...   ...    ...  ..  \n",
       "848           中国编辑  2021-05-10   NaN  349.0  下载  \n",
       "849           新闻记者  2021-05-20   NaN  632.0  下载  \n",
       "850          新闻爱好者  2021-06-20   NaN    NaN  下载  \n",
       "851          新闻爱好者  2021-06-20   NaN    NaN  下载  \n",
       "852      学校党建与思想教育  2021-06-23   NaN    5.0  下载  \n",
       "\n",
       "[903 rows x 8 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#把表格保存在本地\n",
    "with pd.ExcelWriter('cnki知网数据.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_表格.to_excel(writer,sheet_name=\"知网数据\")\n",
    "display(df_表格)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导出refworks文件（.txt）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "#返回首页\n",
    "element = driver.find_element_by_id('total').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'下一页'"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_id('PageNext')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "导出_html = dict()\n",
    "main_content_ =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n"
     ]
    }
   ],
   "source": [
    "#因每次下载操作只能500篇，故分两次操作\n",
    "pages = list(range(1,11))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "#选中页面50篇并进行翻页\n",
    "def xuanze (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        全选 = driver.find_element_by_id('selectCheckAll1')\n",
    "        全选.click()\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        time.sleep(20+10*random())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t"
     ]
    }
   ],
   "source": [
    "xuanze (pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "#第一次导出refworks文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//i[@class=\"icon-r\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-70ACAEC4159F609F82CDB81389AFA4B5',\n",
       " 'CDwindow-700686ADFC2E797AF82D5782D27AD042',\n",
       " 'CDwindow-7BE58775E651D538E8EF418ED53C15B7']"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#查看所有窗口id\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-48-5e47a15b67dc>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n"
     ]
    }
   ],
   "source": [
    "#切换窗口\n",
    "driver.switch_to_window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导出txt文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-50-2c997ac77236>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "#切换窗口\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "#清除所选的500篇原文\n",
    "element = driver.find_element_by_xpath('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[11, 12, 13, 14, 15, 16, 17, 18]\n"
     ]
    }
   ],
   "source": [
    "pages = list(range(11,19))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "def xuanze (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        全选 = driver.find_element_by_id('selectCheckAll1')\n",
    "        全选.click()\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        time.sleep(20+10*random())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11\t12\t13\t14\t15\t16\t17\t18\t"
     ]
    }
   ],
   "source": [
    "xuanze(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "#第二次导出refworks文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//i[@class=\"icon-r\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-70ACAEC4159F609F82CDB81389AFA4B5',\n",
       " 'CDwindow-700686ADFC2E797AF82D5782D27AD042',\n",
       " 'CDwindow-7BE58775E651D538E8EF418ED53C15B7',\n",
       " 'CDwindow-38A6108715A590AA3087227FACF7226E']"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#查看所有窗口id\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-61-5e47a15b67dc>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n"
     ]
    }
   ],
   "source": [
    "#切换窗口\n",
    "driver.switch_to_window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导出txt文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-69-2c997ac77236>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "#切换窗口\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "首页\n"
     ]
    }
   ],
   "source": [
    "#点击回到首页\n",
    "element=driver.find_element_by_xpath('//div[@class=\"pages\"]/a[@id=\"total\"]')\n",
    "print(element.get_attribute('innerHTML'))\n",
    "element.click()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
